##1 Load data ---------------------
library(readr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(gbm)
## Loaded gbm 2.1.8.1
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
library(nnet)
library(e1071)
library(class)
library(reshape2)
library(RColorBrewer)
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(gains)

# Read the sleep & lifestyle dataset (374 rows x 13 columns; column spec echoed below)
sleep_data <- read_csv("Sleep_health_and_lifestyle_dataset.csv")
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI.Category, Blood.Pressure, Sleep.Disorder
## dbl (8): Person.ID, Age, Sleep.Duration, Quality.of.Sleep, Physical.Activity...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##2 explore data ---------------------
# basic analysis
# Column names of the raw dataset (13 columns including the Sleep.Disorder target)
names(sleep_data)
##  [1] "Person.ID"               "Gender"                 
##  [3] "Age"                     "Occupation"             
##  [5] "Sleep.Duration"          "Quality.of.Sleep"       
##  [7] "Physical.Activity.Level" "Stress.Level"           
##  [9] "BMI.Category"            "Blood.Pressure"         
## [11] "Heart.Rate"              "Daily.Steps"            
## [13] "Sleep.Disorder"
# Preview the first six rows
head(sleep_data)
## # A tibble: 6 × 13
##   Person.ID Gender   Age Occupation           Sleep.Duration Quality.of.Sleep
##       <dbl> <chr>  <dbl> <chr>                         <dbl>            <dbl>
## 1         1 Male      27 Software Engineer               6.1                6
## 2         2 Male      28 Doctor                          6.2                6
## 3         3 Male      28 Doctor                          6.2                6
## 4         4 Male      28 Sales Representative            5.9                4
## 5         5 Male      28 Sales Representative            5.9                4
## 6         6 Male      28 Software Engineer               5.9                4
## # ℹ 7 more variables: Physical.Activity.Level <dbl>, Stress.Level <dbl>,
## #   BMI.Category <chr>, Blood.Pressure <chr>, Heart.Rate <dbl>,
## #   Daily.Steps <dbl>, Sleep.Disorder <chr>
# Inspect column types: 5 character columns, 8 numeric (see spec below)
str(sleep_data)
## spc_tbl_ [374 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Person.ID              : num [1:374] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                 : chr [1:374] "Male" "Male" "Male" "Male" ...
##  $ Age                    : num [1:374] 27 28 28 28 28 28 29 29 29 29 ...
##  $ Occupation             : chr [1:374] "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
##  $ Sleep.Duration         : num [1:374] 6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
##  $ Quality.of.Sleep       : num [1:374] 6 6 6 4 4 4 6 7 7 7 ...
##  $ Physical.Activity.Level: num [1:374] 42 60 60 30 30 30 40 75 75 75 ...
##  $ Stress.Level           : num [1:374] 6 8 8 8 8 8 7 6 6 6 ...
##  $ BMI.Category           : chr [1:374] "Overweight" "Normal" "Normal" "Obese" ...
##  $ Blood.Pressure         : chr [1:374] "126/83" "125/80" "125/80" "140/90" ...
##  $ Heart.Rate             : num [1:374] 77 75 75 85 85 85 82 70 70 70 ...
##  $ Daily.Steps            : num [1:374] 4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
##  $ Sleep.Disorder         : chr [1:374] "None" "None" "None" "Sleep Apnea" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Person.ID = col_double(),
##   ..   Gender = col_character(),
##   ..   Age = col_double(),
##   ..   Occupation = col_character(),
##   ..   Sleep.Duration = col_double(),
##   ..   Quality.of.Sleep = col_double(),
##   ..   Physical.Activity.Level = col_double(),
##   ..   Stress.Level = col_double(),
##   ..   BMI.Category = col_character(),
##   ..   Blood.Pressure = col_character(),
##   ..   Heart.Rate = col_double(),
##   ..   Daily.Steps = col_double(),
##   ..   Sleep.Disorder = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Five-number summaries of the numeric columns; character columns report length only
summary(sleep_data)
##    Person.ID         Gender               Age         Occupation       
##  Min.   :  1.00   Length:374         Min.   :27.00   Length:374        
##  1st Qu.: 94.25   Class :character   1st Qu.:35.25   Class :character  
##  Median :187.50   Mode  :character   Median :43.00   Mode  :character  
##  Mean   :187.50                      Mean   :42.18                     
##  3rd Qu.:280.75                      3rd Qu.:50.00                     
##  Max.   :374.00                      Max.   :59.00                     
##  Sleep.Duration  Quality.of.Sleep Physical.Activity.Level  Stress.Level  
##  Min.   :5.800   Min.   :4.000    Min.   :30.00           Min.   :3.000  
##  1st Qu.:6.400   1st Qu.:6.000    1st Qu.:45.00           1st Qu.:4.000  
##  Median :7.200   Median :7.000    Median :60.00           Median :5.000  
##  Mean   :7.132   Mean   :7.313    Mean   :59.17           Mean   :5.385  
##  3rd Qu.:7.800   3rd Qu.:8.000    3rd Qu.:75.00           3rd Qu.:7.000  
##  Max.   :8.500   Max.   :9.000    Max.   :90.00           Max.   :8.000  
##  BMI.Category       Blood.Pressure       Heart.Rate     Daily.Steps   
##  Length:374         Length:374         Min.   :65.00   Min.   : 3000  
##  Class :character   Class :character   1st Qu.:68.00   1st Qu.: 5600  
##  Mode  :character   Mode  :character   Median :70.00   Median : 7000  
##                                        Mean   :70.17   Mean   : 6817  
##                                        3rd Qu.:72.00   3rd Qu.: 8000  
##                                        Max.   :86.00   Max.   :10000  
##  Sleep.Disorder    
##  Length:374        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
#2.1 explore the prediction target: Sleep.Disorder
# Distinct class labels present in the target column
disorder_levels <- unique(sleep_data$Sleep.Disorder)
cat(paste("The outputs from the classification are:", toString(disorder_levels)))
## The outputs from the classification are: None, Sleep Apnea, Insomnia
# Class counts (None dominates; the two disorders are roughly balanced)
table(sleep_data$Sleep.Disorder)
## 
##    Insomnia        None Sleep Apnea 
##          77         219          78
#plot
# Bar chart of class counts. geom_bar() computes counts natively; the original
# geom_histogram(stat = "count") triggered the warning
# "Ignoring unknown parameters: `binwidth`, `bins`, and `pad`".
ggplot(sleep_data, aes(x = Sleep.Disorder, fill = Sleep.Disorder)) +
  geom_bar(position = position_dodge(width = 10)) +
  scale_fill_manual(values = c('#EBDEF0', '#4A235A', '#C39BD3')) +
  labs(title = 'Distribution of persons have sleep disorder or not') +
  theme_minimal() +
  theme(plot.background = element_rect(fill = 'white'),
        panel.background = element_rect(fill = 'white'),
        plot.title = element_text(size = 12, hjust = 0, face = "bold"),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        axis.text.x = element_text(size = 10, face = "bold"),
        axis.text.y = element_text(size = 10, face = "bold"),
        legend.title = element_text(size = 10, face = "bold"))

#2.2 gender
unique_gender <- unique(sleep_data$Gender)
cat(paste("The values of Sex column are:", toString(unique_gender)))
## The values of Sex column are: Male, Female
# Count (Sleep.Disorder, Gender) pairs, then convert each count to a percentage
# within its disorder category. (A duplicate count-only pipeline that was
# immediately overwritten by this one has been removed.)
gender_disorder_counts <- sleep_data %>%
  group_by(Sleep.Disorder, Gender) %>%
  summarise(Count = n(), .groups = 'keep') %>%
  group_by(Sleep.Disorder) %>%
  mutate(Total = sum(Count)) %>%
  ungroup() %>%
  mutate(Percentage = Count / Total * 100)
# Plot: one pie (polar-transformed stacked bar) per disorder category
ggplot(gender_disorder_counts, aes(x="", y=Percentage, fill=Gender)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  facet_wrap(~ Sleep.Disorder) +
  scale_fill_manual(values=c('#C39BD3', '#EBDEF0')) +
  labs(title="The relationship between Sex and Sleep Disorder") +
  theme_void() +
  theme(legend.position="bottom")  # Adjust legend position if desired

##3 data pre-process ---------------------

##3.1 check missing-values
sum(is.na(sleep_data))
## [1] 0
# Drop the row identifier and Heart.Rate in one pass (the comment in the
# original notes every heart rate is within the normal range, so the column
# carries no signal for this task).
sleep_data <- sleep_data[, setdiff(names(sleep_data), c("Person.ID", "Heart.Rate"))]
# Binary target: 1 = "Sleep Apnea" or "Insomnia", 0 = no sleep problem
sleep_data$Sleep.Disorder <- as.numeric(sleep_data$Sleep.Disorder %in% c("Sleep Apnea", "Insomnia"))

##3.2 transfer Blood.Pressure
# Raw "systolic/diastolic" strings; 25 distinct readings in this data
unique(sleep_data$Blood.Pressure)
##  [1] "126/83" "125/80" "140/90" "120/80" "132/87" "130/86" "117/76" "118/76"
##  [9] "128/85" "131/86" "128/84" "115/75" "135/88" "129/84" "130/85" "115/78"
## [17] "119/77" "121/79" "125/82" "135/90" "122/80" "142/92" "140/95" "139/91"
## [25] "118/75"
#categorize blood pressure
# Map a "systolic/diastolic" string (e.g. "126/83") to a clinical category.
# Returns one of: "Low", "Normal", "Elevated", "Hypertension Stage 1",
# "Hypertension Stage 2", "Hypertensive Crisis", "Unknown".
# BUGFIX: the Hypertensive Crisis branch was unreachable because the
# Stage 2 test (sys >= 140 || dias >= 90) ran first and captured every
# crisis-level reading; the crisis check now precedes Stage 2. (No reading
# in this dataset exceeds 180/120, so results on the data are unchanged.)
categorize_blood_pressure <- function(bp) {
  bp_parts <- strsplit(bp, split = "/")
  sys <- as.numeric(bp_parts[[1]][1])
  dias <- as.numeric(bp_parts[[1]][2])
  if (sys < 90 || dias < 60) {
    result <- 'Low'
  } else if (sys < 120 && dias < 80) {
    result <- 'Normal'
  } else if (sys >= 120 && sys < 130 && dias < 80) {
    result <- 'Elevated'
  } else if ((sys >= 130 && sys < 140) || (dias >= 80 && dias < 90)) {
    result <- 'Hypertension Stage 1'
  } else if (sys > 180 || dias > 120) {
    result <- 'Hypertensive Crisis'
  } else if (sys >= 140 || dias >= 90) {
    result <- 'Hypertension Stage 2'
  } else {
    result <- 'Unknown'
  }
  return(result)
}
# Derive the categorical blood-pressure column from the raw strings
sleep_data$Blood.Pressure.Category <- sapply(sleep_data$Blood.Pressure, categorize_blood_pressure)
print(unique(sleep_data$Blood.Pressure.Category))
## [1] "Hypertension Stage 1" "Hypertension Stage 2" "Normal"              
## [4] "Elevated"
#3.3 transfer Age into a categorical variable
# categorize age into four life-stage bins: (0,16], (16,30], (30,45], (45,100]
sleep_data$Age_Group <- cut(
  x = sleep_data$Age,
  breaks = c(0, 16, 30, 45, 100),
  labels = c('Child', 'Young Adults', 'Middle-aged Adults', 'Old Adults'),
  include.lowest = TRUE # Include the lowest value in the first bin
)
head(sleep_data$Age_Group)
## [1] Young Adults Young Adults Young Adults Young Adults Young Adults
## [6] Young Adults
## Levels: Child Young Adults Middle-aged Adults Old Adults
#3.4 BMI.Category data-cleaning: merge the inconsistent "Normal Weight" label into "Normal"
sleep_data$BMI.Category[sleep_data$BMI.Category == "Normal Weight"] <- "Normal"
unique(sleep_data$BMI.Category)
## [1] "Overweight" "Normal"     "Obese"
#3.5 cut separate processing
# Discretize the remaining continuous measures into equal-width bins
# (assignment switched from `=` to the idiomatic `<-`)
sleep_data$Daily.Steps <- cut(sleep_data$Daily.Steps, 4)
sleep_data$Sleep.Duration <- cut(sleep_data$Sleep.Duration, 3)
sleep_data$Physical.Activity.Level <- cut(sleep_data$Physical.Activity.Level, 4)

#3.6 Use tag encoder
# Label-encode every categorical column as consecutive integers
categories <- c('Gender','Occupation','Sleep.Duration',
                'Physical.Activity.Level','BMI.Category',
                'Daily.Steps','Age_Group',
                'Blood.Pressure.Category')
for (label in categories) {
  sleep_data[[label]] <- as.numeric(as.factor(sleep_data[[label]]))
}

#3.7 delete the raw Age and Blood.Pressure columns so every remaining variable
# is numeric and classified. Select by name rather than the fragile positional
# index -c(2, 9), which silently breaks if column order changes upstream.
sleep_data1 <- sleep_data[, !names(sleep_data) %in% c("Age", "Blood.Pressure")]

#3.8 corrplot
# Pairwise correlations of the fully numeric data, drawn as an upper-triangle
# plot with variables ordered by hierarchical clustering
cor_matrix <- cor(sleep_data1)
color_palette <- colorRampPalette(c("pink",'white', "#4A235A"))(200)
corrplot(cor_matrix, type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45, col = color_palette,
         tl.cex = 0.9)
## Warning in ind1:ind2: numerical expression has 2 elements: only the first used

##4 split data ---------------------
# 60/40 train/test split on row indices, seeded for reproducibility
set.seed(20231124)
n_obs <- nrow(sleep_data1)
train_index <- sample(seq_len(n_obs), size = floor(0.60 * n_obs))
# Predictor columns = everything except the target
predictor_cols <- setdiff(names(sleep_data1), "Sleep.Disorder")

train_data <- sleep_data1[train_index, ]
x_train <- as.matrix(train_data[, predictor_cols])
y_train <- train_data$Sleep.Disorder

test_data <- sleep_data1[-train_index, ]
x_test <- as.matrix(test_data[, predictor_cols])
y_test <- as.numeric(test_data$Sleep.Disorder)

##5 Logistic Regression Model ---------------------
#Train: binomial GLM on all predictors
LR_model <- glm(Sleep.Disorder ~ ., data=train_data, family="binomial")
#Test: predicted probability of a disorder, thresholded at 0.5
# (as.data.frame(x_test) works here because every predictor is numeric)
LR_predictions <- predict(LR_model, newdata = as.data.frame(x_test), type = "response")
LR_class_predictions <- ifelse(LR_predictions > 0.5, 1, 0)
#Evaluate
LR_confusion <- confusionMatrix(as.factor(LR_class_predictions), as.factor(as.numeric(y_test)))
LR_confusion
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 77  6
##          1 11 56
##                                           
##                Accuracy : 0.8867          
##                  95% CI : (0.8248, 0.9326)
##     No Information Rate : 0.5867          
##     P-Value [Acc > NIR] : 6.187e-16       
##                                           
##                   Kappa : 0.7691          
##                                           
##  Mcnemar's Test P-Value : 0.332           
##                                           
##             Sensitivity : 0.8750          
##             Specificity : 0.9032          
##          Pos Pred Value : 0.9277          
##          Neg Pred Value : 0.8358          
##              Prevalence : 0.5867          
##          Detection Rate : 0.5133          
##    Detection Prevalence : 0.5533          
##       Balanced Accuracy : 0.8891          
##                                           
##        'Positive' Class : 0               
## 
#the lift chart
# Decile-wise gains table for the LR probabilities on the holdout set
LR_gain <- gains(test_data$Sleep.Disorder, LR_predictions, groups=10)
summary(LR_gain)
##                   Length Class  Mode     
## depth             10     -none- numeric  
## obs               10     -none- numeric  
## cume.obs          10     -none- numeric  
## mean.resp         10     -none- numeric  
## cume.mean.resp    10     -none- numeric  
## cume.pct.of.total 10     -none- numeric  
## lift              10     -none- numeric  
## cume.lift         10     -none- numeric  
## mean.prediction   10     -none- numeric  
## min.prediction    10     -none- numeric  
## max.prediction    10     -none- numeric  
## conf               1     -none- character
## optimal            1     -none- logical  
## num.groups         1     -none- numeric  
## percents           1     -none- logical
plot(LR_gain)

LR_truth <- test_data$Sleep.Disorder
LR_probs <- LR_predictions
LR_preds <- as.numeric(LR_probs > 0.5)  # class prediction at the 0.5 cutoff
# (a duplicate LR_preds assignment that was immediately recomputed has been removed)
LR_nick <- cbind(LR_truth, LR_probs, LR_preds)
# plot lift chart: cumulative captured positives vs. number of cases targeted,
# with the diagonal baseline of random targeting
plot(c(0,LR_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder))~c(0,LR_gain$cume.obs),
     xlab="# cases", ylab="Cumulative", main="", type="l")
lines(c(0,sum(test_data$Sleep.Disorder))~c(0, dim(test_data)[1]), lty=2)

##6 XGBoost Model ---------------------
#Train: 100 boosting rounds with default eta/max_depth (tuned via xgb.cv below)
xgb_model <- xgboost(data = x_train, label = as.numeric(y_train), nrounds = 100, objective = "binary:logistic")
## [1]  train-logloss:0.512365 
## [2]  train-logloss:0.408083 
## [3]  train-logloss:0.343427 
## [4]  train-logloss:0.300811 
## [5]  train-logloss:0.272429 
## [6]  train-logloss:0.252566 
## [7]  train-logloss:0.239665 
## [8]  train-logloss:0.230768 
## [9]  train-logloss:0.222319 
## [10] train-logloss:0.215232 
## [11] train-logloss:0.210969 
## [12] train-logloss:0.204953 
## [13] train-logloss:0.200605 
## [14] train-logloss:0.197245 
## [15] train-logloss:0.194177 
## [16] train-logloss:0.192157 
## [17] train-logloss:0.190653 
## [18] train-logloss:0.189419 
## [19] train-logloss:0.187819 
## [20] train-logloss:0.186768 
## [21] train-logloss:0.185812 
## [22] train-logloss:0.184657 
## [23] train-logloss:0.183728 
## [24] train-logloss:0.182829 
## [25] train-logloss:0.181914 
## [26] train-logloss:0.181002 
## [27] train-logloss:0.180396 
## [28] train-logloss:0.179434 
## [29] train-logloss:0.178251 
## [30] train-logloss:0.177576 
## [31] train-logloss:0.176737 
## [32] train-logloss:0.176133 
## [33] train-logloss:0.175445 
## [34] train-logloss:0.174910 
## [35] train-logloss:0.174515 
## [36] train-logloss:0.174158 
## [37] train-logloss:0.173857 
## [38] train-logloss:0.173424 
## [39] train-logloss:0.173178 
## [40] train-logloss:0.172921 
## [41] train-logloss:0.172570 
## [42] train-logloss:0.172283 
## [43] train-logloss:0.171893 
## [44] train-logloss:0.171619 
## [45] train-logloss:0.171323 
## [46] train-logloss:0.171076 
## [47] train-logloss:0.170834 
## [48] train-logloss:0.170636 
## [49] train-logloss:0.170443 
## [50] train-logloss:0.170218 
## [51] train-logloss:0.169823 
## [52] train-logloss:0.169540 
## [53] train-logloss:0.169351 
## [54] train-logloss:0.169177 
## [55] train-logloss:0.168929 
## [56] train-logloss:0.168791 
## [57] train-logloss:0.168630 
## [58] train-logloss:0.168495 
## [59] train-logloss:0.168368 
## [60] train-logloss:0.168243 
## [61] train-logloss:0.168029 
## [62] train-logloss:0.167913 
## [63] train-logloss:0.167674 
## [64] train-logloss:0.167564 
## [65] train-logloss:0.167419 
## [66] train-logloss:0.167249 
## [67] train-logloss:0.167128 
## [68] train-logloss:0.167030 
## [69] train-logloss:0.166862 
## [70] train-logloss:0.166694 
## [71] train-logloss:0.166592 
## [72] train-logloss:0.166479 
## [73] train-logloss:0.166351 
## [74] train-logloss:0.166267 
## [75] train-logloss:0.166134 
## [76] train-logloss:0.166056 
## [77] train-logloss:0.165942 
## [78] train-logloss:0.165839 
## [79] train-logloss:0.165769 
## [80] train-logloss:0.165703 
## [81] train-logloss:0.165600 
## [82] train-logloss:0.165462 
## [83] train-logloss:0.165389 
## [84] train-logloss:0.165301 
## [85] train-logloss:0.165187 
## [86] train-logloss:0.165119 
## [87] train-logloss:0.165054 
## [88] train-logloss:0.164931 
## [89] train-logloss:0.164867 
## [90] train-logloss:0.164763 
## [91] train-logloss:0.164669 
## [92] train-logloss:0.164605 
## [93] train-logloss:0.164525 
## [94] train-logloss:0.164460 
## [95] train-logloss:0.164395 
## [96] train-logloss:0.164337 
## [97] train-logloss:0.164271 
## [98] train-logloss:0.164189 
## [99] train-logloss:0.164130 
## [100]    train-logloss:0.164084
#Test: probabilities from the default-parameter model; classes at the 0.5 cutoff
xgb_predictions <- predict(xgb_model, newdata = as.matrix(x_test))
xgb_class_predictions <- ifelse(xgb_predictions > 0.5, 1, 0)
# visualize feature importance of the default model
importance_matrix <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix)

# adjust params: grid-search eta and max_depth with 5-fold CV (AUC metric)
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eta = 0.3,
  max_depth = 6
)
# Gather one result row per (eta, max_depth) combination in a list and bind
# once at the end. The original grew `results` with rbind() inside the loop
# AND pre-created it with only 3 columns while each appended row had 4
# (the `nrounds` column was missing from the initializer).
result_rows <- list()
for (eta_val in seq(0.01, 0.3, by = 0.05)) {
  for (depth in 3:8) {
    params$eta <- eta_val
    params$max_depth <- depth

    cv_model <- xgb.cv(
      params = params,
      data = x_train,
      label = as.numeric(y_train),
      nrounds = 100,
      nfold = 5,
      metrics = "auc",
      early_stopping_rounds = 10,
      verbose = FALSE
    )

    # best_iteration is available because early stopping is enabled
    best_iteration <- cv_model$best_iteration
    best_test_auc_mean <- cv_model$evaluation_log$test_auc_mean[best_iteration]
    result_rows[[length(result_rows) + 1]] <- data.frame(
      eta = eta_val,
      max_depth = depth,
      test_auc_mean = best_test_auc_mean,
      nrounds = best_iteration
    )
  }
}
results <- do.call(rbind, result_rows)
#plot result: one line per eta value across candidate tree depths
ggplot(results, aes(x = max_depth, y = test_auc_mean, group = eta, color = as.factor(eta))) +
  geom_line() +
  geom_point() +
  labs(title = "Performance of Different max_depth and eta Combinations",
       x = "Max Depth",
       y = "Test AUC Mean",
       color = "Eta Value") +
  theme_minimal()

# Best hyper-parameters picked from the grid-search plot above
best_params <- params
best_params$eta <- 0.06
best_params$max_depth <- 5
# retrain best xgboost model on the full training split
xgboost_model <- xgboost(
  params = best_params,
  data = x_train,
  label = as.numeric(y_train),
  nrounds = 100
)
## [1]  train-logloss:0.652276 
## [2]  train-logloss:0.615800 
## [3]  train-logloss:0.583111 
## [4]  train-logloss:0.553685 
## [5]  train-logloss:0.527122 
## [6]  train-logloss:0.502362 
## [7]  train-logloss:0.479853 
## [8]  train-logloss:0.459333 
## [9]  train-logloss:0.440599 
## [10] train-logloss:0.423455 
## [11] train-logloss:0.407753 
## [12] train-logloss:0.393341 
## [13] train-logloss:0.379897 
## [14] train-logloss:0.367717 
## [15] train-logloss:0.356502 
## [16] train-logloss:0.345976 
## [17] train-logloss:0.336272 
## [18] train-logloss:0.327322 
## [19] train-logloss:0.319185 
## [20] train-logloss:0.311454 
## [21] train-logloss:0.304369 
## [22] train-logloss:0.297899 
## [23] train-logloss:0.291704 
## [24] train-logloss:0.285937 
## [25] train-logloss:0.280780 
## [26] train-logloss:0.275775 
## [27] train-logloss:0.271150 
## [28] train-logloss:0.266889 
## [29] train-logloss:0.262882 
## [30] train-logloss:0.259146 
## [31] train-logloss:0.255738 
## [32] train-logloss:0.252496 
## [33] train-logloss:0.249614 
## [34] train-logloss:0.246908 
## [35] train-logloss:0.244340 
## [36] train-logloss:0.241954 
## [37] train-logloss:0.239694 
## [38] train-logloss:0.237592 
## [39] train-logloss:0.235637 
## [40] train-logloss:0.233745 
## [41] train-logloss:0.231985 
## [42] train-logloss:0.230135 
## [43] train-logloss:0.228403 
## [44] train-logloss:0.226756 
## [45] train-logloss:0.225117 
## [46] train-logloss:0.223642 
## [47] train-logloss:0.222259 
## [48] train-logloss:0.221016 
## [49] train-logloss:0.219774 
## [50] train-logloss:0.218607 
## [51] train-logloss:0.217509 
## [52] train-logloss:0.216475 
## [53] train-logloss:0.215536 
## [54] train-logloss:0.214651 
## [55] train-logloss:0.213622 
## [56] train-logloss:0.212296 
## [57] train-logloss:0.211058 
## [58] train-logloss:0.209902 
## [59] train-logloss:0.208754 
## [60] train-logloss:0.207676 
## [61] train-logloss:0.206724 
## [62] train-logloss:0.205769 
## [63] train-logloss:0.204891 
## [64] train-logloss:0.204088 
## [65] train-logloss:0.203382 
## [66] train-logloss:0.202656 
## [67] train-logloss:0.201942 
## [68] train-logloss:0.201267 
## [69] train-logloss:0.200680 
## [70] train-logloss:0.199982 
## [71] train-logloss:0.199309 
## [72] train-logloss:0.198704 
## [73] train-logloss:0.198096 
## [74] train-logloss:0.197508 
## [75] train-logloss:0.196976 
## [76] train-logloss:0.196479 
## [77] train-logloss:0.196071 
## [78] train-logloss:0.195596 
## [79] train-logloss:0.195149 
## [80] train-logloss:0.194728 
## [81] train-logloss:0.194368 
## [82] train-logloss:0.194031 
## [83] train-logloss:0.193715 
## [84] train-logloss:0.193316 
## [85] train-logloss:0.192971 
## [86] train-logloss:0.192726 
## [87] train-logloss:0.192349 
## [88] train-logloss:0.191983 
## [89] train-logloss:0.191570 
## [90] train-logloss:0.191291 
## [91] train-logloss:0.190956 
## [92] train-logloss:0.190691 
## [93] train-logloss:0.190401 
## [94] train-logloss:0.190088 
## [95] train-logloss:0.189788 
## [96] train-logloss:0.189543 
## [97] train-logloss:0.189259 
## [98] train-logloss:0.189018 
## [99] train-logloss:0.188755 
## [100]    train-logloss:0.188534
#Evaluate
# NOTE(review): this confusion matrix uses xgb_class_predictions from the
# UNTUNED xgb_model, not the retrained xgboost_model — confirm this is intended.
xgb_confusion <- confusionMatrix(as.factor(xgb_class_predictions), as.factor(as.numeric(y_test)))
xgb_confusion
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 82  5
##          1  6 57
##                                           
##                Accuracy : 0.9267          
##                  95% CI : (0.8726, 0.9628)
##     No Information Rate : 0.5867          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8491          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9318          
##             Specificity : 0.9194          
##          Pos Pred Value : 0.9425          
##          Neg Pred Value : 0.9048          
##              Prevalence : 0.5867          
##          Detection Rate : 0.5467          
##    Detection Prevalence : 0.5800          
##       Balanced Accuracy : 0.9256          
##                                           
##        'Positive' Class : 0               
## 
#the lift chart
# Decile-wise gains table for the XGBoost probabilities on the holdout set
xgb_gain <- gains(test_data$Sleep.Disorder, xgb_predictions, groups=10)
summary(xgb_gain)
##                   Length Class  Mode     
## depth             10     -none- numeric  
## obs               10     -none- numeric  
## cume.obs          10     -none- numeric  
## mean.resp         10     -none- numeric  
## cume.mean.resp    10     -none- numeric  
## cume.pct.of.total 10     -none- numeric  
## lift              10     -none- numeric  
## cume.lift         10     -none- numeric  
## mean.prediction   10     -none- numeric  
## min.prediction    10     -none- numeric  
## max.prediction    10     -none- numeric  
## conf               1     -none- character
## optimal            1     -none- logical  
## num.groups         1     -none- numeric  
## percents           1     -none- logical
plot(xgb_gain)

xgb_truth <- test_data$Sleep.Disorder
xgb_probs <- xgb_predictions
xgb_preds <- as.numeric(xgb_probs > 0.5)  # class prediction at the 0.5 cutoff
# (a duplicate xgb_preds assignment that was immediately recomputed has been removed)
xgb_nick <- cbind(xgb_truth, xgb_probs, xgb_preds)
# plot lift chart: cumulative captured positives vs. number of cases targeted,
# with the diagonal baseline of random targeting
plot(c(0,xgb_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder))~c(0,xgb_gain$cume.obs),
     xlab="# cases", ylab="Cumulative", main="", type="l")
lines(c(0,sum(test_data$Sleep.Disorder))~c(0, dim(test_data)[1]), lty=2)

# plot the first two boosted trees of the tuned model
xgb.plot.tree(model=xgboost_model, trees = 0)
xgb.plot.tree(model=xgboost_model, trees = 1)
##7 SVC Model ---------------------
#Train
# NOTE(review): Sleep.Disorder is numeric (0/1) here, so svm() fits
# eps-regression rather than C-classification; predictions below are
# continuous scores thresholded at 0.5. Wrap the target in as.factor()
# if a true SVC with class probabilities is wanted — confirm intent.
svc_model <- svm(Sleep.Disorder ~ ., data = train_data)
#Test
svc_predictions <- predict(svc_model, newdata = x_test)
svc_class_predictions <- ifelse(svc_predictions > 0.5, 1, 0)
#Evaluate
svc_confusion <- confusionMatrix(as.factor(svc_class_predictions), as.factor(as.numeric(y_test)))
svc_confusion
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 79  6
##          1  9 56
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.8404, 0.9429)
##     No Information Rate : 0.5867          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7953          
##                                           
##  Mcnemar's Test P-Value : 0.6056          
##                                           
##             Sensitivity : 0.8977          
##             Specificity : 0.9032          
##          Pos Pred Value : 0.9294          
##          Neg Pred Value : 0.8615          
##              Prevalence : 0.5867          
##          Detection Rate : 0.5267          
##    Detection Prevalence : 0.5667          
##       Balanced Accuracy : 0.9005          
##                                           
##        'Positive' Class : 0               
## 
#the lift chart
# Decile-wise gains table for the SVM scores on the holdout set
svc_gain <- gains(test_data$Sleep.Disorder, svc_predictions, groups=10)
summary(svc_gain)
##                   Length Class  Mode     
## depth             10     -none- numeric  
## obs               10     -none- numeric  
## cume.obs          10     -none- numeric  
## mean.resp         10     -none- numeric  
## cume.mean.resp    10     -none- numeric  
## cume.pct.of.total 10     -none- numeric  
## lift              10     -none- numeric  
## cume.lift         10     -none- numeric  
## mean.prediction   10     -none- numeric  
## min.prediction    10     -none- numeric  
## max.prediction    10     -none- numeric  
## conf               1     -none- character
## optimal            1     -none- logical  
## num.groups         1     -none- numeric  
## percents           1     -none- logical
plot(svc_gain)

svc_truth <- test_data$Sleep.Disorder
svc_probs <- svc_predictions
svc_preds <- as.numeric(svc_probs > 0.5)  # class prediction at the 0.5 cutoff
# (a duplicate svc_preds assignment that was immediately recomputed has been removed)
svc_nick <- cbind(svc_truth, svc_probs, svc_preds)
# plot lift chart. BUGFIX: the x-axis previously used xgb_gain$cume.obs
# (copy-paste from the XGBoost section); it must come from svc_gain.
plot(c(0,svc_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder))~c(0,svc_gain$cume.obs),
     xlab="# cases", ylab="Cumulative", main="", type="l")
lines(c(0,sum(test_data$Sleep.Disorder))~c(0, dim(test_data)[1]), lty=2)

##8 Classification Tree ---------------------
#Train: fully grown tree (cp = 0, minsplit = 1); pruning candidates come from the cp table
train.ct <- rpart(Sleep.Disorder ~ .,data = train_data,method = "class",cp = 0,minsplit = 1)
#Plot
prp(train.ct,type = 1,extra = 1,split.font = 1,varlen = -10,box.palette = "#EBDEF0",border.col = "#4A235A")

#Inspect the cp table (complexity vs. cross-validated error at each split count)
train.ct$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.78494624      0 1.0000000 1.0000000 0.07929945
## 2 0.03225806      1 0.2150538 0.2150538 0.04589054
## 3 0.01612903      2 0.1827957 0.1935484 0.04374847
## 4 0.01075269      5 0.1290323 0.1827957 0.04261894
## 5 0.00000000      6 0.1182796 0.1720430 0.04144620
#Train- Predict: in-sample accuracy of the unpruned tree (optimistic by construction)
train.ct.pred <- predict(train.ct,train_data,type = "class")
confusionMatrix(train.ct.pred, as.factor(train_data$Sleep.Disorder), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 127   7
##          1   4  86
##                                           
##                Accuracy : 0.9509          
##                  95% CI : (0.9138, 0.9752)
##     No Information Rate : 0.5848          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8984          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.9247          
##             Specificity : 0.9695          
##          Pos Pred Value : 0.9556          
##          Neg Pred Value : 0.9478          
##              Prevalence : 0.4152          
##          Detection Rate : 0.3839          
##    Detection Prevalence : 0.4018          
##       Balanced Accuracy : 0.9471          
##                                           
##        'Positive' Class : 1               
## 
#Test: holdout predictions from the same unpruned tree
test.ct.pred <- predict(train.ct,test_data,type = "class")
#Evaluate
confusionMatrix(test.ct.pred, as.factor(test_data$Sleep.Disorder), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 80  6
##          1  8 56
##                                          
##                Accuracy : 0.9067         
##                  95% CI : (0.8484, 0.948)
##     No Information Rate : 0.5867         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8085         
##                                          
##  Mcnemar's Test P-Value : 0.7893         
##                                          
##             Sensitivity : 0.9032         
##             Specificity : 0.9091         
##          Pos Pred Value : 0.8750         
##          Neg Pred Value : 0.9302         
##              Prevalence : 0.4133         
##          Detection Rate : 0.3733         
##    Detection Prevalence : 0.4267         
##       Balanced Accuracy : 0.9062         
##                                          
##        'Positive' Class : 1              
## 
# Cross-validation in RPART using xval
# NOTE(review): this CV tree is fit on the FULL dataset (sleep_data1), not the
# training split, so its xerror is not directly comparable to the holdout
# results above — confirm this is intended.
cv.ct <- rpart(Sleep.Disorder ~ ., data = sleep_data1, method = "class",
               cp = 0, minsplit = 1, xval = 10)
printcp(cv.ct)
## 
## Classification tree:
## rpart(formula = Sleep.Disorder ~ ., data = sleep_data1, method = "class", 
##     cp = 0, minsplit = 1, xval = 10)
## 
## Variables actually used in tree construction:
## [1] BMI.Category     Gender           Occupation       Quality.of.Sleep
## [5] Stress.Level    
## 
## Root node error: 155/374 = 0.41444
## 
## n= 374 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.7741935      0   1.00000 1.00000 0.061464
## 2 0.0258065      1   0.22581 0.22581 0.036338
## 3 0.0150538      2   0.20000 0.20000 0.034400
## 4 0.0064516      5   0.15484 0.17419 0.032291
## 5 0.0032258      7   0.14194 0.15484 0.030575
## 6 0.0000000      9   0.13548 0.14839 0.029974
##9 Random Forest ------------------------
# Fit an initial forest of 100 trees so we can inspect how the
# out-of-bag error evolves with forest size.
train.rf <- randomForest(as.factor(Sleep.Disorder) ~ .,
                         data = train_data,
                         importance = TRUE,
                         ntree = 100)
# OOB classification error vs. number of trees (column 1 of err.rate).
plot(train.rf$err.rate[, 1], type = "l",
     xlab = "Number of Trees", ylab = "Classification Error")

# The error curve flattens well before 100 trees; refit with ntree = 60.
train.rf <- randomForest(as.factor(Sleep.Disorder) ~ .,
                         data = train_data,
                         importance = TRUE,
                         ntree = 60)
# Permutation-based variable importance (type = 1).
varImpPlot(train.rf, type = 1, main = "Variable Importance")

# Score the hold-out set: hard class labels plus per-class probabilities
# (the probabilities feed the lift chart below).
rf.pred <- predict(train.rf, newdata = test_data)
rf.pred1 <- predict(train.rf, newdata = test_data, type = "prob")
# Evaluate with class "1" (disorder present) as the positive class.
confusionMatrix(data = rf.pred,
                reference = as.factor(test_data$Sleep.Disorder),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 80  5
##          1  8 57
##                                          
##                Accuracy : 0.9133         
##                  95% CI : (0.8564, 0.953)
##     No Information Rate : 0.5867         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8226         
##                                          
##  Mcnemar's Test P-Value : 0.5791         
##                                          
##             Sensitivity : 0.9194         
##             Specificity : 0.9091         
##          Pos Pred Value : 0.8769         
##          Neg Pred Value : 0.9412         
##              Prevalence : 0.4133         
##          Detection Rate : 0.3800         
##    Detection Prevalence : 0.4333         
##       Balanced Accuracy : 0.9142         
##                                          
##        'Positive' Class : 1              
## 
# Lift chart for the random forest, driven by the predicted probability of
# the positive class (second column of the probability matrix).
positive_prob <- rf.pred1[,2]
actual_numeric <- as.numeric(as.factor(test_data$Sleep.Disorder)) - 1  # convert the factor to 0 and 1
rf_gain <- gains(actual_numeric, positive_prob, groups=10)
summary(rf_gain)
##                   Length Class  Mode     
## depth             4      -none- numeric  
## obs               4      -none- numeric  
## cume.obs          4      -none- numeric  
## mean.resp         4      -none- numeric  
## cume.mean.resp    4      -none- numeric  
## cume.pct.of.total 4      -none- numeric  
## lift              4      -none- numeric  
## cume.lift         4      -none- numeric  
## mean.prediction   4      -none- numeric  
## min.prediction    4      -none- numeric  
## max.prediction    4      -none- numeric  
## conf              1      -none- character
## optimal           1      -none- logical  
## num.groups        1      -none- logical
## percents          1      -none- logical
# Cumulative gains curve against the random-baseline diagonal.
# NOTE(review): the summary above shows only 4 depth groups despite
# groups = 10 — gains() appears to have merged bins (few distinct RF
# probability values); confirm this is acceptable.
# NOTE(review): sum(test_data$Sleep.Disorder) assumes the column is numeric
# 0/1 at this point; if it were still a factor/character this sum would
# fail — confirm against the recoding done earlier in the script.
plot(c(0,rf_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder)) ~ c(0,rf_gain$cume.obs),
     xlab="# Cases", ylab="Cumulative", main="Lift Chart for Random Forest", type="l")
lines(c(0,sum(test_data$Sleep.Disorder)) ~ c(0, nrow(test_data)), lty=2)

##10 KNN ----------------------
# k-nearest-neighbour classification of the hold-out set with k = 2.
# NOTE(review): knn() is given the full train_data/test_data frames rather
# than the predictor-only x_train/x_test used for NB/LDA below; if
# train_data still contains the Sleep.Disorder target column, the target
# leaks into the distance computation — confirm how train_data was built.
# NOTE(review): knn() breaks distance ties at random; without set.seed()
# these results are not exactly reproducible.
knn.pred <- knn(train = train_data, test = test_data, cl = y_train, k = 2)
# caret's confusionMatrix defaults to the first factor level ("0") as the
# positive class here, unlike the positive = "1" evaluations above.
knn_cm <- confusionMatrix(data = knn.pred, reference = factor(y_test, levels = c("0", "1")))
print(knn_cm$table)
##           Reference
## Prediction  0  1
##          0 82 10
##          1  6 52
print(knn_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 82 10
##          1  6 52
##                                           
##                Accuracy : 0.8933          
##                  95% CI : (0.8326, 0.9378)
##     No Information Rate : 0.5867          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7779          
##                                           
##  Mcnemar's Test P-Value : 0.4533          
##                                           
##             Sensitivity : 0.9318          
##             Specificity : 0.8387          
##          Pos Pred Value : 0.8913          
##          Neg Pred Value : 0.8966          
##              Prevalence : 0.5867          
##          Detection Rate : 0.5467          
##    Detection Prevalence : 0.6133          
##       Balanced Accuracy : 0.8853          
##                                           
##        'Positive' Class : 0               
## 
# Sweep k = 1..20 and record hold-out accuracy for each setting.
# Review fixes versus the original loop:
#  * the reference labels were built with a redundant factor(factor(...))
#    double conversion; a single factor() call with explicit levels is
#    equivalent (levels are coerced to character either way);
#  * the reference factor does not depend on k, so it is hoisted out of
#    the loop instead of being rebuilt 20 times;
#  * seq_len() replaces the hard-coded 1:20 / seq(1, 20, 1) literals.
# NOTE(review): knn() breaks distance ties at random, so without
# set.seed() this accuracy curve is not exactly reproducible.
n_k <- 20
accuracy.df <- data.frame(k = seq_len(n_k), accuracy = numeric(n_k))
ref_labels <- factor(y_test, levels = c(0, 1))
for (i in seq_len(n_k)) {
  knn.pred <- knn(train = train_data, test = test_data, cl = y_train, k = i)
  accuracy.df[i, 2] <- confusionMatrix(factor(knn.pred, levels = c(0, 1)),
                                       ref_labels)$overall[1]
}
# Accuracy as a function of k.
plot(accuracy.df[,1], accuracy.df[,2], type = "l",
     xlab = "k",
     ylab = "Accuracy")

# Refit kNN at the chosen k.
# NOTE(review): "best k = 3" presumably comes from the accuracy plot above,
# yet the printed accuracy below (0.8733) is lower than the initial k = 2
# fit (0.8933) — confirm the chart actually favoured k = 3.
knn.pred <- knn(train = train_data, test = test_data, cl = y_train, k = 3)
knn_cm <- confusionMatrix(data = knn.pred, reference = factor(y_test, levels = c("0", "1")))
print(knn_cm$table)
##           Reference
## Prediction  0  1
##          0 79 10
##          1  9 52
print(knn_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 79 10
##          1  9 52
##                                          
##                Accuracy : 0.8733         
##                  95% CI : (0.8093, 0.922)
##     No Information Rate : 0.5867         
##     P-Value [Acc > NIR] : 1.623e-14      
##                                          
##                   Kappa : 0.7382         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.8977         
##             Specificity : 0.8387         
##          Pos Pred Value : 0.8876         
##          Neg Pred Value : 0.8525         
##              Prevalence : 0.5867         
##          Detection Rate : 0.5267         
##    Detection Prevalence : 0.5933         
##       Balanced Accuracy : 0.8682         
##                                          
##        'Positive' Class : 0              
## 
# Response chart: sort cases by predicted class (1s first), split them into
# 10 equal-sized groups, and plot the cumulative count of true positives.
# Since knn.pred is a hard 0/1 label (not a probability), the ordering only
# separates predicted-1 cases from predicted-0 cases; within each class the
# order is arbitrary (order() is stable on ties).
results_df <- data.frame(actual = as.numeric(as.factor(y_test)) - 1,
                         predicted = as.numeric(as.factor(knn.pred)) - 1)
results_df <- results_df[order(results_df$predicted, decreasing = TRUE),]
results_df$group <- cut(seq(1, nrow(results_df)), breaks = 10, labels = FALSE)
cumulative_response <- tapply(results_df$actual, results_df$group, function(x) sum(x))
cumulative_response_sum <- cumsum(cumulative_response)
plot(cumulative_response_sum, type = "o", xlab = "Group", ylab = "Cumulative Positive Responses",
     main = "kNN Model Response Chart")

##11 Naive Bayes ----------------------
# Fit a Naive Bayes classifier on the predictor matrix / label vector pair,
# then score the held-out predictors.
nb <- naiveBayes(x = x_train, y = y_train)
nb_predictions <- predict(nb, newdata = x_test)
# Cross-tabulate predictions (rows) against true labels (columns).
nb_confusionMatrix <- table(Predicted = nb_predictions, Actual = y_test)
print(nb_confusionMatrix)
##          Actual
## Predicted  0  1
##         0 79  6
##         1  9 56
# Distribution of predicted classes.
summary(nb_predictions)
##  0  1 
## 85 65
# Compute sensitivity, specificity, and overall accuracy from a 2x2
# confusion table laid out as rows = predicted, columns = actual, with the
# first class in row/column 1. Note that "sensitivity" here is the recall
# of the FIRST class (column 1) — the opposite convention from the
# positive = "1" evaluations earlier in the script.
calculate_metrics <- function(confusionMatrix) {
  first_class_total <- confusionMatrix[1, 1] + confusionMatrix[2, 1]
  second_class_total <- confusionMatrix[1, 2] + confusionMatrix[2, 2]
  list(
    sensitivity = confusionMatrix[1, 1] / first_class_total,
    specificity = confusionMatrix[2, 2] / second_class_total,
    accuracy = sum(diag(confusionMatrix)) / sum(confusionMatrix)
  )
}
# Summarise the Naive Bayes confusion table (sensitivity/specificity are
# relative to class 0, the first row/column — see calculate_metrics()).
nb_metrics <- calculate_metrics(nb_confusionMatrix)
print(nb_metrics)
## $sensitivity
## [1] 0.8977273
## 
## $specificity
## [1] 0.9032258
## 
## $accuracy
## [1] 0.9
##12 Linear Discriminant Analysis ----------------------
# Fit LDA on the predictor matrix with the class labels as the grouping
# factor, then classify the hold-out predictors.
# NOTE(review): assigning the fitted model to `lda` masks MASS::lda() in
# the global environment — any later call to lda() in this script would
# dispatch to this object and fail; consider renaming to e.g. lda_model.
lda <- lda(x_train, grouping = y_train)
# predict() returns a list; $class holds the predicted labels.
lda_predictions <- predict(lda, x_test)$class
# Cross-tabulate predictions against the true labels and summarise with
# the same class-0-as-positive metrics used for Naive Bayes above.
lda_confusionMatrix <- table(Predicted = lda_predictions, Actual = y_test)
print(lda_confusionMatrix)
##          Actual
## Predicted  0  1
##         0 79  5
##         1  9 57
lda_metrics <- calculate_metrics(lda_confusionMatrix)
print(lda_metrics)
## $sensitivity
## [1] 0.8977273
## 
## $specificity
## [1] 0.9193548
## 
## $accuracy
## [1] 0.9066667